Outline:
1. Simple sentiment analysis of one play (Hamlet)
2. Same method on all Shakespeare’s tragedies
3. Complex sentiment analysis
4. Simple temporal sentiment analysis I
5. Simple temporal sentiment analysis II

Install and load packages

if(!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(tidyverse,tidytext,glue,stringr,tm,zoo)

Sentiment analysis of one play (Hamlet)

First, we load in ‘Hamlet’ and do some cleaning of the text.
No need to remove any words.

# Read the whole play as a single string. The text is used as-is rather than
# being passed through glue(), which would try to evaluate anything written
# inside curly braces in the file as R code.
text <- read_file("Hamlet.txt")

# One row per word; unnest_tokens() lower-cases the tokens by default, so no
# separate tolower() step is needed.
tokens <- tibble(text = text) %>% unnest_tokens(word, text)

tokens
## # A tibble: 32,197 x 1
##    word    
##    <chr>   
##  1 act     
##  2 i       
##  3 scene   
##  4 i       
##  5 elsinore
##  6 a       
##  7 platform
##  8 before  
##  9 the     
## 10 castle  
## # ... with 32,187 more rows

We then do the following with our tokens:

  1. subset ‘sentiment words’ using the ‘bing’ lexicon
  2. count positive and negative words
  3. switch to wide format
  4. create a new variable calculating N positive - negative words
  5. switch back to long format (for plotting)

# Summarise Hamlet's sentiment words: keep only tokens found in the 'bing'
# lexicon, count them per sentiment, add the positive-minus-negative balance,
# and return everything in long (key/value) format for plotting.
sentiments <- tokens %>%
  # Name the join key explicitly so the join cannot silently pick up any
  # other shared column and does not emit a "Joining, by" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate("+/-" = positive - negative) %>%
  gather()

sentiments
## # A tibble: 3 x 2
##   key        value
##   <chr>      <dbl>
## 1 negative  1290  
## 2 positive  1233  
## 3 +/-      -  57.0

Now we can plot the results!

# Bar chart of the summary values in `sentiments`: one bar per key, with the
# bar height taken directly from the value column.
sentimentsBars <- sentiments %>%
  ggplot(aes(x = key, y = value)) +
  geom_col(fill = "darkred")

sentimentsBars

But what’s in ‘bing’ and the other available sentiment lexica?

# Fix the RNG state so the random lexicon samples are reproducible.
set.seed(7693)
# bing: print 10 randomly chosen entries
bingLexicon <- get_sentiments("bing")
bingLexicon[sample(nrow(bingLexicon), 10), ]
## # A tibble: 10 x 2
##    word          sentiment
##    <chr>         <chr>    
##  1 mendacity     negative 
##  2 faultless     positive 
##  3 pitiless      negative 
##  4 fast-paced    positive 
##  5 superiority   positive 
##  6 resplendent   positive 
##  7 blasphemy     negative 
##  8 insensitively negative 
##  9 uproot        negative 
## 10 reproach      negative
# nrc: print 10 randomly chosen entries
nrcLexicon <- get_sentiments("nrc")
nrcLexicon[sample(nrow(nrcLexicon), 10), ]
## # A tibble: 10 x 2
##    word        sentiment   
##    <chr>       <chr>       
##  1 assessment  trust       
##  2 moribund    sadness     
##  3 villainous  disgust     
##  4 thirteenth  fear        
##  5 loom        anticipation
##  6 shot        surprise    
##  7 influential trust       
##  8 fancy       positive    
##  9 engaged     positive    
## 10 regression  negative
# afinn: print 10 randomly chosen entries
afinnLexicon <- get_sentiments("afinn")
afinnLexicon[sample(nrow(afinnLexicon), 10), ]
## # A tibble: 10 x 2
##    word       score
##    <chr>      <int>
##  1 degrade       -2
##  2 bereaved      -2
##  3 deceit        -3
##  4 prblm         -2
##  5 envy          -1
##  6 wrong         -2
##  7 dont like     -2
##  8 advantages     2
##  9 suicidal      -2
## 10 disparaged    -2
# loughran: print 10 randomly chosen entries
loughranLexicon <- get_sentiments("loughran")
loughranLexicon[sample(nrow(loughranLexicon), 10), ]
## # A tibble: 10 x 2
##    word           sentiment
##    <chr>          <chr>    
##  1 collapse       negative 
##  2 chattels       litigious
##  3 mistakes       negative 
##  4 exaggerating   negative 
##  5 enabling       positive 
##  6 antitrust      negative 
##  7 jurisdictional litigious
##  8 abrogate       litigious
##  9 delinquency    negative 
## 10 impair         negative
# List the distinct sentiment categories used by the loughran lexicon.
get_sentiments("loughran")$sentiment %>% unique()
## [1] "negative"     "positive"     "uncertainty"  "litigious"   
## [5] "constraining" "superfluous"

Sentiment analysis of all Shakespeare’s tragedies

# Every plain-text file in the working directory is treated as one play.
# The dot is escaped and the pattern anchored so that only names ending in
# ".txt" match (an unescaped "." matches any character, anywhere in the name).
playList <- list.files(pattern = "\\.txt$")

# Count the positive and negative 'bing' words of one play and return the
# counts, plus their difference ("+/-"), in long key/value format with the
# file name attached in the Play column.
countPlaySentiments <- function(play) {
  # The raw text is not passed through glue(), which would evaluate anything
  # inside curly braces; unnest_tokens() lower-cases tokens by default.
  tibble(text = read_file(play)) %>%
    unnest_tokens(word, text) %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    count(sentiment) %>%
    spread(sentiment, n, fill = 0) %>%
    mutate("+/-" = positive - negative) %>%
    gather() %>%
    mutate(Play = play)
}

# Build one small tibble per play and bind them once, instead of growing
# `df` with rbind() on every loop iteration.
df <- bind_rows(lapply(playList, countPlaySentiments))

# One panel per play, each showing the same summary bars as for Hamlet.
# Only a trailing ".txt" extension is removed from the panel labels; the dot
# is escaped so it is not treated as a regex wildcard.
sentimentsBarsAll <- df %>%
  mutate(Play = sub("\\.txt$", "", Play)) %>%
  ggplot(aes(x = key, y = value)) +
  geom_col(fill = "darkred") +
  facet_wrap(~Play)

sentimentsBarsAll

Complex sentiment analysis

Same code as above, except that we choose “nrc” instead of “bing”, and change the angle of the x-axis labels so that they all fit, i.e. adding:
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Simple temporal sentiment analysis I

We first add row numbers and then create a variable called polarity with values -1 and 1, corresponding to “negative” and “positive”.
We then use this variable to create another variable containing the “rolling mean” of 50 sentiment words.

# For each play, plot every sentiment word as a bar of height -1 or 1 at its
# position in the text, overlaid with a rolling mean of the polarity.
for (play in playList) {
  # Tokenise the play and number the tokens so that ID is each word's
  # position in the text. The raw text is not passed through glue(), which
  # would evaluate anything inside curly braces.
  tokens <- tibble(text = read_file(play)) %>%
    unnest_tokens(word, text) %>%
    rowid_to_column("ID")

  sentiments <- tokens %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(
      # Build polarity in one vectorised step so it always has one value per
      # row; filling a NULL column by logical index can yield a vector that
      # is shorter than the table.
      polarity = if_else(sentiment == "negative", -1, 1),
      # Rolling mean over a window of 50 sentiment words; positions at either
      # end without a full window are filled with NA.
      rollMean = rollmean(polarity, 50, fill = list(NA, NULL, NA))
    )

  polarityPlot <- ggplot(sentiments) +
    aes(ID, polarity, fill = sentiment) +
    geom_col() +
    geom_line(aes(ID, rollMean)) +
    # Strip only a trailing ".txt" extension from the title.
    ggtitle(sub("\\.txt$", "", play)) +
    theme_minimal()

  # Plots are not auto-printed inside a for loop.
  print(polarityPlot)
}

Each bar represents a negative or positive word in chronological order.
The lines represent the rolling means.

Simple temporal sentiment analysis II

Here, we aggregate our data and compute a mean for every 30 observations.

# For each play, average the polarity of consecutive runs of 30 sentiment
# words and plot one bar per run.
for (play in playList) {
  # The raw text is not passed through glue(), which would evaluate anything
  # inside curly braces; unnest_tokens() lower-cases tokens by default.
  tokens <- tibble(text = read_file(play)) %>%
    unnest_tokens(word, text)

  sentiments <- tokens %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(
      polarity = if_else(sentiment == "negative", -1, 1),
      # 1-based index of the run of 30 observations each row belongs to.
      row = (row_number() - 1) %/% 30 + 1
    )

  # Mean polarity per run. Grouping by run index means a final, shorter run
  # is averaged over its own observations only; reshaping into a 30-row
  # matrix would instead pad it with values recycled from the start.
  chunkMeans <- sentiments %>%
    group_by(row) %>%
    summarise(means = mean(polarity))

  chunkPlot <- chunkMeans %>%
    ggplot() +
    # Strip only a trailing ".txt" extension from the title.
    ggtitle(sub("\\.txt$", "", play)) +
    theme_dark() +
    aes(row, means, fill = means) +
    geom_col() +
    ylim(-1, 1) +
    scale_fill_gradient2(low = "red", mid = "white",
                         high = "blue", midpoint = 0, space = "Lab",
                         na.value = "grey50", guide = "colourbar")

  # Plots are not auto-printed inside a for loop.
  print(chunkPlot)
}

BONUS!

library(wordcloud2)

# Tokenise every play and stack the tokens into one tibble. Each play is
# processed separately and the pieces are bound once, instead of growing `df`
# with rbind() inside a loop. The raw text is not passed through glue(),
# which would evaluate anything inside curly braces.
df <- bind_rows(lapply(playList, function(play) {
  tibble(text = read_file(play)) %>% unnest_tokens(word, text)
}))

# Frequency of each word across all plays, restricted to words in the 'bing'
# lexicon and sorted from most to least frequent. count() groups by word on
# its own, so no separate group_by() is needed and the result is ungrouped.
sentiments <- df %>%
  count(word) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  arrange(desc(n))

set.seed(1112)
wordcloud2(sentiments, figPath = "silh2.png", size = 1.5, color = "snow", backgroundColor = "black")
knitr::include_graphics("sentimentCLoud.png")